{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. 观察Otto商品的特征进行PCA各维的方差，可以得到什么结论？（20分） "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "#导入必要的工具包\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.decomposition import PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>feat_1</th>\n",
       "      <th>feat_2</th>\n",
       "      <th>feat_3</th>\n",
       "      <th>feat_4</th>\n",
       "      <th>feat_5</th>\n",
       "      <th>feat_6</th>\n",
       "      <th>feat_7</th>\n",
       "      <th>feat_8</th>\n",
       "      <th>feat_9</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_85</th>\n",
       "      <th>feat_86</th>\n",
       "      <th>feat_87</th>\n",
       "      <th>feat_88</th>\n",
       "      <th>feat_89</th>\n",
       "      <th>feat_90</th>\n",
       "      <th>feat_91</th>\n",
       "      <th>feat_92</th>\n",
       "      <th>feat_93</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>6</td>\n",
       "      <td>1</td>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>2</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>...</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>1</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 95 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \\\n",
       "0   1       1       0       0       0       0       0       0       0       0   \n",
       "1   2       0       0       0       0       0       0       0       1       0   \n",
       "2   3       0       0       0       0       0       0       0       1       0   \n",
       "3   4       1       0       0       1       6       1       5       0       0   \n",
       "4   5       0       0       0       0       0       0       0       0       0   \n",
       "\n",
       "   ...  feat_85  feat_86  feat_87  feat_88  feat_89  feat_90  feat_91  \\\n",
       "0  ...        1        0        0        0        0        0        0   \n",
       "1  ...        0        0        0        0        0        0        0   \n",
       "2  ...        0        0        0        0        0        0        0   \n",
       "3  ...        0        1        2        0        0        0        0   \n",
       "4  ...        1        0        0        0        0        1        0   \n",
       "\n",
       "   feat_92  feat_93   target  \n",
       "0        0        0  Class_1  \n",
       "1        0        0  Class_1  \n",
       "2        0        0  Class_1  \n",
       "3        0        0  Class_1  \n",
       "4        0        0  Class_1  \n",
       "\n",
       "[5 rows x 95 columns]"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#读取训练数据\n",
    "dpath = './data/'\n",
    "train = pd.read_csv(dpath +\"Otto_train.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = train['target']   \n",
    "X_train = train.drop([\"id\", \"target\"], axis=1)\n",
    "\n",
    "#用于存储pca变换后的特征\n",
    "train_id = train['id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "pca = PCA(n_components = 0.85)\n",
    "pca.fit(X_train)\n",
    "    \n",
    "# 在训练集降维 \n",
    "X_train_pca = pca.transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "34\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<BarContainer object of 34 artists>"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAX8AAAD4CAYAAAAEhuazAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAASJ0lEQVR4nO3df6xf913f8edrdpNCypI0MVNxEtmQDHC7qoNbF2ldhwgNDhU1CIc6HSWZMhkkLIG6AS7a0tQrUoKAgETG5pFA2lDcLFC4IoZQKWVsqATfpGlaJ4TeBq+5ddW4OC3LUJq6ee+P7/H61Tf3x7k/fL9f38/zIVk+53M+59z39yh+nXM/53w/SVUhSWrLPxp3AZKk9Wf4S1KDDH9JapDhL0kNMvwlqUGbx13AqEsvvbS2bds27jIk6Zzy8MMPf6GqtvTtP3Hhv23bNmZmZsZdhiSdU5L87+X0d9hHkhpk+EtSg3qFf5JdSZ5MMpvkwDzb35TkkSSnk+wZan9dko8mOZbksSRvW8viJUkrs2T4J9kE3AFcC+wArk+yY6TbZ4AbgQ+MtP8D8GNV9WpgF/CrSS5abdGSpNXp88B3JzBbVU8BJDkM7AYeP9Ohqo53214c3rGq/mZo+USSZ4AtwBdXXbkkacX6DPtsBZ4eWp/r2pYlyU7gPODT82zbl2QmyczJkyeXe2hJ0jL1Cf/M07asqUCTvAp4P/BvqurF0e1VdaiqpqpqasuW3q+pSpJWqE/4zwGXD61fBpzo+wOS/GPgfuA/VNVfLq88SdLZ0Cf8jwJXJdme5DxgLzDd5+Bd/w8B76uq/77yMiVJa2nJB75VdTrJfuABYBNwV1UdS3IQmKmq6SSvZxDyFwM/kOQ93Rs+PwK8CbgkyY3dIW+sqkfPxocB2Hbg/gW3Hb/1LWfrx0rSOaXX9A5VdQQ4MtJ289DyUQbDQaP73QPcs8oaJUlrzG/4SlKDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QGGf6S1CDDX5IaZPhLUoMMf0lqkOEvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JalCv8E+yK8mTSWaTHJhn+5uSPJLkdJI9I9tuSPKp7s8Na1W4JGnllgz/JJuAO4BrgR3A9Ul2jHT7DHAj8IGRfV8JvBt4A7ATeHeSi1dftiRpNfrc+e8EZqvqqap6ATgM7B7uUFXHq+ox4MWRfb8P+HBVnaqqZ4EPA7vWoG5J0ir0Cf+twNND63NdWx+99k2yL8lMkpmTJ0/2PLQkaaX6hH/maauex++1b1UdqqqpqprasmVLz0NLklaqT/jPAZcPrV8GnOh5/NXsK0k6S/qE/1HgqiTbk5wH7AWmex7/AeCaJBd3D3qv6dokSWO0ZPhX1WlgP4PQfgK4t6qOJTmY5K0ASV6fZA64DvivSY51+54C/hODC8hR4GDXJkkao819OlXVEeDISNvNQ8tHGQzpzLfvXcBdq6hRkrTG/IavJDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QG9frfOG402w7cv+C247e+ZR0rkaTx8M5fkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QG9Qr/JLuSPJlkNsmBebafn+SD3faHkmzr2l+W5O4kn0jyRJJ3rW35kqSVWDL8k2wC7gCuBXYA1yfZMdLtJuDZqroSuB24rWu/Dji/qv4Z8J3Aj5+5MEiSxqfPnf9OYLaqnqqqF4DDwO6RPruBu7vl+4CrkwQo4IIkm4GvA14A/n5NKpckrVif8N8KPD20Pte1zdunqk4DXwIuYXAh+L/A54DPAL9UVadGf0CSfUlm
ksycPHly2R9CkrQ8fcI/87RVzz47ga8C3wRsB/5dkm9+SceqQ1U1VVVTW7Zs6VGSJGk1+oT/HHD50PplwImF+nRDPBcCp4C3A39SVV+pqmeAvwCmVlu0JGl1+oT/UeCqJNuTnAfsBaZH+kwDN3TLe4AHq6oYDPV8TwYuAL4L+Ou1KV2StFJLhn83hr8feAB4Ari3qo4lOZjkrV23O4FLkswC7wTOvA56B/AK4JMMLiK/VVWPrfFnkCQtU6/5/KvqCHBkpO3moeXnGbzWObrfc/O1S5LGy2/4SlKDDH9JapDhL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QGGf6S1CDDX5IaZPhLUoMMf0lqkOEvSQ0y/CWpQYa/JDXI8JekBm0edwGTatuB+xfcdvzWt6xjJZK09rzzl6QGGf6S1CDDX5Ia1Cv8k+xK8mSS2SQH5tl+fpIPdtsfSrJtaNtrk3w0ybEkn0jy8rUrX5K0Eks+8E2yCbgDeDMwBxxNMl1Vjw91uwl4tqquTLIXuA14W5LNwD3AO6rq40kuAb6y5p9iTHwoLOlc1efOfycwW1VPVdULwGFg90if3cDd3fJ9wNVJAlwDPFZVHweoqr+rqq+uTemSpJXqE/5bgaeH1ue6tnn7VNVp4EvAJcA/BSrJA0keSfKz8/2AJPuSzCSZOXny5HI/gyRpmfqEf+Zpq559NgNvBP519/cPJbn6JR2rDlXVVFVNbdmypUdJkqTV6BP+c8DlQ+uXAScW6tON818InOra/0dVfaGq/gE4AnzHaouWJK1On/A/ClyVZHuS84C9wPRIn2nghm55D/BgVRXwAPDaJF/fXRT+FfA4kqSxWvJtn6o6nWQ/gyDfBNxVVceSHARmqmoauBN4f5JZBnf8e7t9n03yKwwuIAUcqaqFX5GRJK2LXnP7VNURBkM2w203Dy0/D1y3wL73MHjdU5I0IfyGryQ1yPCXpAYZ/pLUIMNfkhpk+EtSgwx/SWqQ4S9JDTL8JalBhr8kNcjwl6QGGf6S1CDDX5IaZPhLUoMMf0lqkOEvSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JatDmcRew0W07cP+C247f+pZ1rESSvsY7f0lqkOEvSQ0y/CWpQb3CP8muJE8mmU1yYJ7t5yf5YLf9oSTbRrZfkeS5JP9+bcqWJK3GkuGfZBNwB3AtsAO4PsmOkW43Ac9W1ZXA7cBtI9tvB/549eVKktZCnzv/ncBsVT1VVS8Ah4HdI312A3d3y/cBVycJQJIfBJ4Cjq1NyZKk1eoT/luBp4fW57q2eftU1WngS8AlSS4Afg54z2I/IMm+JDNJZk6ePNm3dknSCvV5zz/ztFXPPu8Bbq+q57pfBOZVVYeAQwBTU1Ojx97w/C6ApPXWJ/zngMuH1i8DTizQZy7JZuBC4BTwBmBPkl8ELgJeTPJ8Vf36qiuXJK1Yn/A/ClyVZDvwWWAv8PaRPtPADcBHgT3Ag1VVwL880yHJLcBzBr8kjd+S4V9Vp5PsBx4ANgF3VdWxJAeBmaqaBu4E3p9klsEd/96zWbQkaXV6ze1TVUeAIyNtNw8tPw9ct8QxbllBfZKks8Bv+EpSgwx/SWqQ4S9JDXI+/3OE3wWQtJa885ekBhn+ktQgw1+SGmT4S1KDDH9JapBv+2wgvhEkqS/v/CWpQYa/JDXI8JekBjnm3xifC0gC7/wlqUmGvyQ1yGEfvYRDQ9LG552/JDXI8JekBjnsoxVxaEg6t3nnL0kNMvwlqUGGvyQ1yPCXpAYZ/pLUIMNfkhrU61XPJLuAXwM2Ab9ZVbeObD8feB/wncDfAW+rquNJ3gzcCpwHvAD8TFU9uIb1a4L5Oqg0uZYM/ySbgDuANwNzwNEk01X1+FC3m4Bnq+rKJHuB24C3AV8AfqCqTiR5DfAAsHWtP4TOXV4gpPHoM+yzE5itqqeq6gXg
MLB7pM9u4O5u+T7g6iSpqo9V1Ymu/Rjw8u63BEnSGPUJ/63A00Prc7z07v3/96mq08CXgEtG+vww8LGq+vLKSpUkrZU+Y/6Zp62W0yfJqxkMBV0z7w9I9gH7AK644ooeJaklDg1Ja69P+M8Blw+tXwacWKDPXJLNwIXAKYAklwEfAn6sqj493w+oqkPAIYCpqanRC4u0JC8Q0vL0Cf+jwFVJtgOfBfYCbx/pMw3cAHwU2AM8WFWV5CLgfuBdVfUXa1e2tHxeIKSvWXLMvxvD38/gTZ0ngHur6liSg0ne2nW7E7gkySzwTuBA174fuBL4j0ke7f5845p/CknSsvR6z7+qjgBHRtpuHlp+Hrhunv3eC7x3lTVK68bfDtQK5/OXlskLhDYCp3eQpAYZ/pLUIId9pLPAoSFNOu/8JalB3vlLY+JvBxon7/wlqUGGvyQ1yGEfaYI5NKSzxfCXznFeILQSDvtIUoO885ca4G8HGmX4SwL6XSC8iGwchr+kNeUF4tzgmL8kNcg7f0nrziGm8TP8JZ2zvECsnOEvaUNbq98yNtqFxvCXpDVyLl1EfOArSQ0y/CWpQYa/JDXI8JekBhn+ktQgw1+SGmT4S1KDDH9JalCv8E+yK8mTSWaTHJhn+/lJPthtfyjJtqFt7+ran0zyfWtXuiRppZYM/ySbgDuAa4EdwPVJdox0uwl4tqquBG4Hbuv23QHsBV4N7AL+c3c8SdIY9bnz3wnMVtVTVfUCcBjYPdJnN3B3t3wfcHWSdO2Hq+rLVfW3wGx3PEnSGKWqFu+Q7AF2VdW/7dbfAbyhqvYP9flk12euW/808AbgFuAvq+qerv1O4I+r6r6Rn7EP2Netfivw5Oo/GgCXAl9Yo2OtF2teH9a8Pqx5fVwKXFBVW/ru0Gdit8zTNnrFWKhPn32pqkPAoR61LEuSmaqaWuvjnk3WvD6seX1Y8/roat62nH36DPvMAZcPrV8GnFioT5LNwIXAqZ77SpLWWZ/wPwpclWR7kvMYPMCdHukzDdzQLe8BHqzBeNI0sLd7G2g7cBXwV2tTuiRppZYc9qmq00n2Aw8Am4C7qupYkoPATFVNA3cC708yy+COf2+377Ek9wKPA6eBn6yqr56lzzKfNR9KWgfWvD6seX1Y8/pYds1LPvCVJG08fsNXkhpk+EtSgzZs+C81JcUkSnI8ySeSPJpkZtz1zCfJXUme6b7bcabtlUk+nORT3d8Xj7PGUQvUfEuSz3bn+tEk3z/OGkcluTzJR5I8keRYkp/q2if2XC9S88Se6yQvT/JXST7e1fyern17N1XNp7qpa84bd61nLFLzbyf526Hz/LpFj7MRx/y7KST+Bngzg9dNjwLXV9XjYy1sCUmOA1NVNbFfMEnyJuA54H1V9Zqu7ReBU1V1a3ehvbiqfm6cdQ5boOZbgOeq6pfGWdtCkrwKeFVVPZLkG4CHgR8EbmRCz/UiNf8IE3quu5kILqiq55K8DPhfwE8B7wR+v6oOJ/kvwMer6jfGWesZi9T8E8AfjX6JdiEb9c6/z5QUWoGq+nMGb3QNG57e424G/+AnxgI1T7Sq+lxVPdIt/x/gCWArE3yuF6l5YtXAc93qy7o/BXwPg6lqYPLO80I1L8tGDf+twNND63NM+H+EnQL+NMnD3ZQX54p/UlWfg0EAAN845nr62p/ksW5YaGKGT0Z1s+T+c+AhzpFzPVIzTPC5TrIpyaPAM8CHgU8DX6yq012XicuP0Zqr6sx5/oXuPN+e5PzFjrFRw7/XtBIT6F9U1XcwmEH1J7vhCp0dvwF8C/A64HPAL4+3nPkleQXwe8BPV9Xfj7uePuapeaLPdVV9tapex2AGgp3At8/XbX2rWtxozUleA7wL+Dbg9cArgUWHAzdq+J+T00pU1Ynu72eAD3HuzID6+W6898y47zNjrmdJVfX57h/Qi8B/YwLPdTee+3vA71TV73fNE32u56v5XDjXAFX1ReDPgO8CLuqmqoEJzo+h
mnd1w25VVV8GfoslzvNGDf8+U1JMlCQXdA/JSHIBcA3wycX3mhjD03vcAPzhGGvp5UyAdn6ICTvX3UO9O4EnqupXhjZN7LleqOZJPtdJtiS5qFv+OuB7GTyr+AiDqWpg8s7zfDX/9dBNQRg8o1j0PG/It30AutfJfpWvTUnxC2MuaVFJvpnB3T4Mpt34wCTWnOR3ge9mMIXs54F3A38A3AtcAXwGuK6qJuYB6wI1fzeDYYgCjgM/fmYsfRIkeSPwP4FPAC92zT/PYAx9Is/1IjVfz4Se6ySvZfBAdxODm+F7q+pg9+/xMIPhk48BP9rdUY/dIjU/CGxhMOz9KPATQw+GX3qcjRr+kqSFbdRhH0nSIgx/SWqQ4S9JDTL8JalBhr8kNcjwl6QGGf6S1KD/B4OG32nOUlN+AAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制主成分占85%方差\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "print(len(pca.explained_variance_ratio_))\n",
    "plt.bar(range(len(pca.explained_variance_ratio_)), pca.explained_variance_ratio_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从上图可见：降维后，34个主成分能解释的方差已经占到总体的85%，即使用36.6%的主成分即可代表85%的方差，选择所以可选择这34个主成分代表原始数据所有的特征。"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. 对Otto商品tfidf特征，进行PCA降维，给出各维方差的分布图。（30分）"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "scrolled": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>feat_1_tfidf</th>\n",
       "      <th>feat_2_tfidf</th>\n",
       "      <th>feat_3_tfidf</th>\n",
       "      <th>feat_4_tfidf</th>\n",
       "      <th>feat_5_tfidf</th>\n",
       "      <th>feat_6_tfidf</th>\n",
       "      <th>feat_7_tfidf</th>\n",
       "      <th>feat_8_tfidf</th>\n",
       "      <th>feat_9_tfidf</th>\n",
       "      <th>...</th>\n",
       "      <th>feat_85_tfidf</th>\n",
       "      <th>feat_86_tfidf</th>\n",
       "      <th>feat_87_tfidf</th>\n",
       "      <th>feat_88_tfidf</th>\n",
       "      <th>feat_89_tfidf</th>\n",
       "      <th>feat_90_tfidf</th>\n",
       "      <th>feat_91_tfidf</th>\n",
       "      <th>feat_92_tfidf</th>\n",
       "      <th>feat_93_tfidf</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>0.081393</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.075886</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>2</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.231403</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>3</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.199730</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>4</td>\n",
       "      <td>0.011987</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.011668</td>\n",
       "      <td>0.105971</td>\n",
       "      <td>0.021681</td>\n",
       "      <td>0.080435</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.008244</td>\n",
       "      <td>0.022456</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>5</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>...</td>\n",
       "      <td>0.124622</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.000000</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.145988</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>0.0</td>\n",
       "      <td>Class_1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 95 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "   id  feat_1_tfidf  feat_2_tfidf  feat_3_tfidf  feat_4_tfidf  feat_5_tfidf  \\\n",
       "0   1      0.081393           0.0           0.0      0.000000      0.000000   \n",
       "1   2      0.000000           0.0           0.0      0.000000      0.000000   \n",
       "2   3      0.000000           0.0           0.0      0.000000      0.000000   \n",
       "3   4      0.011987           0.0           0.0      0.011668      0.105971   \n",
       "4   5      0.000000           0.0           0.0      0.000000      0.000000   \n",
       "\n",
       "   feat_6_tfidf  feat_7_tfidf  feat_8_tfidf  feat_9_tfidf  ...  feat_85_tfidf  \\\n",
       "0      0.000000      0.000000      0.000000           0.0  ...       0.075886   \n",
       "1      0.000000      0.000000      0.231403           0.0  ...       0.000000   \n",
       "2      0.000000      0.000000      0.199730           0.0  ...       0.000000   \n",
       "3      0.021681      0.080435      0.000000           0.0  ...       0.000000   \n",
       "4      0.000000      0.000000      0.000000           0.0  ...       0.124622   \n",
       "\n",
       "   feat_86_tfidf  feat_87_tfidf  feat_88_tfidf  feat_89_tfidf  feat_90_tfidf  \\\n",
       "0       0.000000       0.000000            0.0            0.0       0.000000   \n",
       "1       0.000000       0.000000            0.0            0.0       0.000000   \n",
       "2       0.000000       0.000000            0.0            0.0       0.000000   \n",
       "3       0.008244       0.022456            0.0            0.0       0.000000   \n",
       "4       0.000000       0.000000            0.0            0.0       0.145988   \n",
       "\n",
       "   feat_91_tfidf  feat_92_tfidf  feat_93_tfidf   target  \n",
       "0            0.0            0.0            0.0  Class_1  \n",
       "1            0.0            0.0            0.0  Class_1  \n",
       "2            0.0            0.0            0.0  Class_1  \n",
       "3            0.0            0.0            0.0  Class_1  \n",
       "4            0.0            0.0            0.0  Class_1  \n",
       "\n",
       "[5 rows x 95 columns]"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 读取数据\n",
    "dpath = './data/'\n",
    "train = pd.read_csv(dpath +\"Otto_FE_train_tfidf.csv\")\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = train['target']   \n",
    "X_train = train.drop([\"id\", \"target\"], axis=1)\n",
    "\n",
    "#用于存储pca变换后的特征\n",
    "train_id = train['id']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "pca = PCA(n_components = 0.85)\n",
    "pca.fit(X_train)\n",
    "    \n",
    "# 在训练集降维 \n",
    "X_train_pca = pca.transform(X_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "48\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<BarContainer object of 48 artists>"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD4CAYAAADiry33AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAP4UlEQVR4nO3cf6zdd13H8efLlm0IcYOtGlyLLVk1lIAotSMRlTAdnUOKsYsdqDWZmSY0wQhiZ8KAislmDNXEmdjQah3qtsxfN664LAx/hODoHZMfZS5c6mTXLeyOluE0W+l4+8f5LpycnfZ+23tvb/s5z0dyc7/fz/fzPefzSXNfn08/53w/qSokSe36juVugCRpaRn0ktQ4g16SGmfQS1LjDHpJatzK5W7AqEsuuaTWrl273M2QpHPK/fff/0RVrRp37awL+rVr1zI9Pb3czZCkc0qS/zrRNZduJKlxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcWfdk7ELtXbnXc8re/imq5ehJZJ0dug1o0+yOclDSWaS7Bxz/ceTfCbJ8SRbR65tT/Kl7mf7YjVcktTPvEGfZAVwC3AVsAG4NsmGkWpfAX4Z+MuRe18KvB+4HNgEvD/JSxbebElSX31m9JuAmao6XFXHgNuALcMVqurhqvoc8K2Re98M3FNVR6rqKHAPsHkR2i1J6qlP0F8KPDJ0PtuV9dHr3iTXJ5lOMj03N9fzpSVJffQJ+owpq56v3+veqtpTVRurauOqVWO3U5YknaY+QT8LrBk6Xw082vP1F3KvJGkR9An6g8D6JOuSnAdsA6Z6vv7dwJVJXtJ9CHtlVyZJOkPmDfqqOg7sYBDQDwJ3VNWhJLuSvBUgyY8kmQWuAf4kyaHu3iPA7zAYLA4Cu7oySdIZ0uuBqao6ABwYKbtx6Pggg2WZcffuA/YtoI2SpAVwCwRJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhrXK+iTbE7yUJKZJDvHXD8/ye3d9fuSrO3KX5Bkf5LPJ3kwyQ2L23xJ0nzmDfokK4BbgKuADcC1STaMVLsOOFpVlwG7gZu78muA86vq1cDrgF99bhCQJJ0ZfWb0m4CZqjpcVceA24AtI3W2APu74zuBK5IEKOBFSVYCLwSOAd9YlJZLknrpE/SXAo8Mnc92ZWPrVNVx4EngYgah/7/AY8BXgN+vqiOjb5Dk+iTTSabn5uZOuROSpBPrE/QZU1Y962wCngW+F1gHvDvJK55XsWpPVW2sqo2rVq3q0SRJUl99gn4WWDN0vhp49ER1umWaC4EjwNuBf6yqb1bV48AngY0LbbQkqb8+QX8QWJ9kXZLzgG3A1EidKWB7d7wVuLeqisFyzZsy8CLg9cB/LE7TJUl9zBv03Zr7DuBu4EHgjqo6lGRXkrd21fYCFyeZAX4DeO4rmLcALwa+wGDA+NOq+twi90GSdBIr+1SqqgPAgZGyG4eOn2bwVcrR+54aVy5JOnN8MlaSGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWpcr71uWrB2513PK3v4pquXoSWSdGY5o5ekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuN6BX2SzUkeSjKTZOeY6+cnub27fl+StUPXXpPkU0kOJfl8kgsWr/mSpPnMG/RJVgC3AFcBG4Brk2wY
qXYdcLSqLgN2Azd3964EPgr8WlW9Cngj8M1Fa70kaV59ZvSbgJmqOlxVx4DbgC0jdbYA+7vjO4ErkgS4EvhcVX0WoKq+VlXPLk7TJUl99An6S4FHhs5nu7KxdarqOPAkcDHw/UAluTvJZ5K8d9wbJLk+yXSS6bm5uVPtgyTpJPoEfcaUVc86K4E3AO/ofv9skiueV7FqT1VtrKqNq1at6tEkSVJffYJ+FlgzdL4aePREdbp1+QuBI135P1fVE1X1f8AB4IcX2mhJUn99gv4gsD7JuiTnAduAqZE6U8D27ngrcG9VFXA38Jok39kNAD8BfHFxmi5J6mPlfBWq6niSHQxCewWwr6oOJdkFTFfVFLAXuDXJDIOZ/Lbu3qNJPsxgsCjgQFXdtUR9kSSNMW/QA1TVAQbLLsNlNw4dPw1cc4J7P8rgK5aSpGXgk7GS1LheM/qWrd35/JWkh2+6ehlaIklLwxm9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMZN/BYIJ+P2CJJa4Ixekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpce5eeRrc1VLSucQZvSQ1zqCXpMYZ9JLUOINekhrXK+iTbE7yUJKZJDvHXD8/ye3d9fuSrB25/vIkTyV5z+I0W5LU17xBn2QFcAtwFbABuDbJhpFq1wFHq+oyYDdw88j13cDHFt5cSdKp6jOj3wTMVNXhqjoG3AZsGamzBdjfHd8JXJEkAEneBhwGDi1OkyVJp6JP0F8KPDJ0PtuVja1TVceBJ4GLk7wI+C3ggwtvqiTpdPQJ+owpq551PgjsrqqnTvoGyfVJppNMz83N9WiSJKmvPk/GzgJrhs5XA4+eoM5skpXAhcAR4HJga5LfAy4CvpXk6ar6o+Gbq2oPsAdg48aNo4OIJGkB+gT9QWB9knXAfwPbgLeP1JkCtgOfArYC91ZVAT/2XIUkHwCeGg15SdLSmjfoq+p4kh3A3cAKYF9VHUqyC5iuqilgL3BrkhkGM/ltS9loSVJ/vTY1q6oDwIGRshuHjp8GrpnnNT5wGu2TJC2QT8ZKUuMMeklqnPvRLyL3qZd0NnJGL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOL91c4b4jRxJy8UZvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxPjC1zHyQStJSc0YvSY1zRn+WGjfTB2f7kk6dM3pJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNa7XpmZJNgN/CKwAPlJVN41cPx/4c+B1wNeAn6+qh5P8FHATcB5wDPjNqrp3Eds/kdzaWNKpmHdGn2QFcAtwFbABuDbJhpFq1wFHq+oyYDdwc1f+BPAzVfVqYDtw62I1XJLUT5+lm03ATFUdrqpjwG3AlpE6W4D93fGdwBVJUlUPVNWjXfkh4IJu9i9JOkP6BP2lwCND57Nd2dg6VXUceBK4eKTOzwEPVNUzo2+Q5Pok00mm5+bm+rZdktRDnzX6jCmrU6mT5FUMlnOuHPcGVbUH2AOwcePG0ddWT67dSxqnT9DPAmuGzlcDj56gzmySlcCFwBGAJKuBvwV+qaq+vOAW67Q4CEiTq8/SzUFgfZJ1Sc4DtgFTI3WmGHzYCrAVuLeqKslFwF3ADVX1ycVqtCSpv3ln9FV1PMkO4G4GX6/cV1WHkuwCpqtqCtgL3JpkhsFMflt3+w7gMuB9Sd7XlV1ZVY8vdkd0epzpS+3r9T36qjoAHBgpu3Ho+GngmjH3fQj40ALbKElagF5Br8njTF9qh1sgSFLjDHpJapxLNzplLutI5xZn9JLUOINekhrn0o0WjUs60tnJGb0kNc4ZvZbcuJk+ONuXzhRn9JLUOGf0Wlau60tLz6DXWelkA4CDg3RqDHo1
wwFAGs+gV/P834EmnUEvjXGiAcCBQecig15aJA4COlsZ9NIScwDQcjPopWVysgfJHBy0mAx66RxyOh8sO2jIoJcm1GIOGg4mZzeDXtKSOp3BwYFjcRn0ks4ZDhqnx6CXNJEmadAw6CVpgU5l0DjZtaUaNNymWJIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktS4XkGfZHOSh5LMJNk55vr5SW7vrt+XZO3QtRu68oeSvHnxmi5J6mPeoE+yArgFuArYAFybZMNIteuAo1V1GbAbuLm7dwOwDXgVsBn44+71JElnSJ8Z/SZgpqoOV9Ux4DZgy0idLcD+7vhO4Iok6cpvq6pnquo/gZnu9SRJZ0iq6uQVkq3A5qr6le78F4HLq2rHUJ0vdHVmu/MvA5cDHwD+rao+2pXvBT5WVXeOvMf1wPXd6Q8ADy28a1wCPLEIr3OumuT+T3Lfwf5Pav+/r6pWjbvQZ1OzjCkbHR1OVKfPvVTVHmBPj7b0lmS6qjYu5mueSya5/5Pcd7D/k97/cfos3cwCa4bOVwOPnqhOkpXAhcCRnvdKkpZQn6A/CKxPsi7JeQw+XJ0aqTMFbO+OtwL31mBNaArY1n0rZx2wHvj04jRdktTHvEs3VXU8yQ7gbmAFsK+qDiXZBUxX1RSwF7g1yQyDmfy27t5DSe4AvggcB95ZVc8uUV9GLepS0Dlokvs/yX0H+z/p/X+eeT+MlSSd23wyVpIaZ9BLUuOaDPr5tmxoTZJ9SR7vnmd4ruylSe5J8qXu90uWs41LJcmaJJ9I8mCSQ0ne1ZVPSv8vSPLpJJ/t+v/Brnxdtx3Jl7rtSc5b7rYulSQrkjyQ5B+684npe1/NBX3PLRta82cMtpgYthP4eFWtBz7enbfoOPDuqnol8Hrgnd2/96T0/xngTVX1g8Brgc1JXs9gG5LdXf+PMtimpFXvAh4cOp+kvvfSXNDTb8uGplTVvzD4ttOw4W0p9gNvO6ONOkOq6rGq+kx3/D8M/uAvZXL6X1X1VHf6gu6ngDcx2I4EGu5/ktXA1cBHuvMwIX0/FS0G/aXAI0Pns13ZpPmeqnoMBmEIfPcyt2fJdbum/hBwHxPU/27p4t+Bx4F7gC8DX6+q412Vlv8G/gB4L/Ct7vxiJqfvvbUY9L22XVBbkrwY+Gvg16vqG8vdnjOpqp6tqtcyePJ8E/DKcdXObKuWXpK3AI9X1f3DxWOqNtf3U9Vnr5tzjdsuDHw1ycuq6rEkL2Mw22tSkhcwCPm/qKq/6Yonpv/PqaqvJ/knBp9VXJRkZTezbfVv4EeBtyb5aeAC4LsYzPAnoe+npMUZfZ8tGybB8LYU24G/X8a2LJluTXYv8GBVfXjo0qT0f1WSi7rjFwI/yeBzik8w2I4EGu1/Vd1QVaurai2Dv/N7q+odTEDfT1WTT8Z2I/wf8O0tG353mZu0pJL8FfBGBtuzfhV4P/B3wB3Ay4GvANdU1egHtue8JG8A/hX4PN9ep/1tBuv0k9D/1zD4wHEFg4nbHVW1K8krGHwR4aXAA8AvVNUzy9fSpZXkjcB7quotk9b3PpoMeknSt7W4dCNJGmLQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMb9PyA10MOwhWNfAAAAAElFTkSuQmCC\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 绘制主成分占85%方差\n",
    "import matplotlib.pyplot as plt\n",
    "%matplotlib inline\n",
    "print(len(pca.explained_variance_ratio_))\n",
    "plt.bar(range(len(pca.explained_variance_ratio_)), pca.explained_variance_ratio_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. 采用train_test_split，从将数据集中随机抽取10000条记录（原始数据集太大，剩余数据抛弃，此部分SVM作业已经完成）。对这部分数据进行PCA降维，保留85%的能量。（20分） "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "dpath = './data/'\n",
    "train1 = pd.read_csv(dpath +\"Otto_FE_train_org.csv\")\n",
    "train2 = pd.read_csv(dpath +\"Otto_FE_train_tfidf.csv\")\n",
    "\n",
    "train2 = train2.drop([\"id\",\"target\"], axis=1)\n",
    "train =  pd.concat([train1, train2], axis = 1, ignore_index=False)\n",
    "\n",
    "del train1\n",
    "del train2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "y_train = train['target'] \n",
    "X_train = train.drop([\"target\"], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_train_part, X_val, y_train_part, y_val = train_test_split(X_train, y_train, train_size = 10000,random_state = 0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 187)"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "X_train_part.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "X_train_part_id=X_train_part[\"id\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Remove the id column so it is not fed to PCA as a feature.\n",
    "# errors=\"ignore\" keeps this cell re-runnable: without it a second run raises\n",
    "# KeyError because the column has already been dropped from X_train_part.\n",
    "X_train_part = X_train_part.drop(columns=[\"id\"], errors=\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A float n_components asks PCA to keep just enough components to explain 85% of the variance\n",
    "pca = PCA(n_components=0.85)\n",
    "\n",
    "# Fit on the sampled training subset, then project that same subset\n",
    "X_train_pca = pca.fit(X_train_part).transform(X_train_part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 53)"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show (rows, retained components) of the PCA-projected data\n",
    "X_train_pca.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "53\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<BarContainer object of 53 artists>"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAD4CAYAAADiry33AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8QZhcZAAAQUklEQVR4nO3df6xfd13H8efLlg0cZoOuGG2Ht2TVUBRRa0cCKm4yOocrxi52oNZkZprQBAMEi4ljVEiYMcw/mImNmy4gbssUbVxxLgx/hODoHQNGNxouc7JLCbuzZTjMNgpv//ie6pfv7t093f32/vjc5yNp7jmf8znf7+eTffc6n/s53/O5qSokSe36nqVugCTp9DLoJalxBr0kNc6gl6TGGfSS1Li1S92AUeeee25NTEwsdTMkaUW55557Hq2q9bMdW3ZBPzExweTk5FI3Q5JWlCT/Odcxp24kqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxy+7J2IWa2Hv7d+0/9L5Ll6glkrQ89BrRJ9me5EiSqSR7Zzn+s0k+neREkp0jx3Yn+WL3b/e4Gi5J6mfeoE+yBrgeuATYAlyRZMtItS8Dvwl8eOTcFwLvAi4AtgHvSvKChTdbktRXnxH9NmCqqh6sqqeAm4EdwxWq6qGq+hzwnZFzXwfcWVXHquo4cCewfQztliT11CfoNwAPD+1Pd2V99Do3yVVJJpNMzszM9HxpSVIffYI+s5RVz9fvdW5V7a+qrVW1df36WZdTliQ9S32Cfho4b2h/I3C05+sv5FxJ0hj0CfpDwOYkm5KcAewCDvR8/TuAi5O8oLsJe3FXJklaJPMGfVWdAPYwCOgHgFur6nCSfUkuA0jy00mmgcuBP0tyuDv3GPCHDC4Wh4B9XZkkaZH0emCqqg4CB0fKrh7aPsRgWma2c28EblxAGyVJC+ASCJLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TG9Qr6JNuTHEkylWTvLMfPTHJLd/zuJBNd+XOS3JTkviQPJHnneJsvSZrPvEGfZA1wPXAJsAW4IsmWkWpXAser6nzgOuDarvxy4Myq+jHgp4DfPnkRkCQtjj4j+m3AVFU9WFVPATcDO0bq7ABu6rZvAy5KEqCAs5KsBZ4HPAV8YywtlyT10ifoNwAPD+1Pd2Wz1qmqE8BjwDoGof9N4KvAl4E/rqpjo2+Q5Kokk0kmZ2ZmTrkTkqS59Qn6zFJWPetsA74N/CCwCXhbkpc8rWLV/qraWlVb169f36NJkqS++gT9NHDe0P5G4OhcdbppmrOBY8AbgX+sqm9V1SPAJ4CtC220JKm/PkF/CNicZFOSM4BdwIGROgeA3d32TuCuqioG0zUXZuAs4JXAF8bTdElSH/MGfTfnvge4A3gAuLWqDifZl+SyrtoNwLokU8BbgZNfwbweeD7weQYXjL+oqs+NuQ+SpGewtk+lqjoIHBwpu3po+wkGX6UcPe/x2colSYvHJ2MlqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxvZ6MXekm9t7+tLKH3nfpErREkhafI3pJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxvUK+iTbkxxJMpVk7yzHz0xyS3f87iQTQ8denuSTSQ4nuS/Jc8fXfEnSfOYN+iRrgOuBS4AtwBVJtoxU
uxI4XlXnA9cB13bnrgU+BPxOVb0MeA3wrbG1XpI0rz4j+m3AVFU9WFVPATcDO0bq7ABu6rZvAy5KEuBi4HNV9VmAqvqvqvr2eJouSeqjT9BvAB4e2p/uymatU1UngMeAdcAPA5XkjiSfTvKO2d4gyVVJJpNMzszMnGofJEnPoE/QZ5ay6llnLfBq4E3dz19OctHTKlbtr6qtVbV1/fr1PZokSeqrT9BPA+cN7W8Ejs5Vp5uXPxs41pX/S1U9WlX/AxwEfnKhjZYk9dcn6A8Bm5NsSnIGsAs4MFLnALC7294J3FVVBdwBvDzJ93YXgJ8D7h9P0yVJfaydr0JVnUiyh0ForwFurKrDSfYBk1V1ALgB+GCSKQYj+V3duceTvJ/BxaKAg1V1+2nqiyRpFvMGPUBVHWQw7TJcdvXQ9hPA5XOc+yEGX7GUJC0Bn4yVpMb1GtG3amLv02eRHnrfpUvQEkk6fRzRS1LjDHpJapxBL0mNM+glqXEGvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxq3pRs7mMLnbmQmeSVjJH9JLUOINekhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNc6gl6TGGfSS1DiDXpIaZ9BLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktS4XkGfZHuSI0mmkuyd5fiZSW7pjt+dZGLk+IuTPJ7k7eNptiSpr3mDPska4HrgEmALcEWSLSPVrgSOV9X5wHXAtSPHrwM+uvDmSpJOVZ+/GbsNmKqqBwGS3AzsAO4fqrMDuKbbvg34QJJUVSV5A/Ag8M2xtXqJ+LdkJa1EfaZuNgAPD+1Pd2Wz1qmqE8BjwLokZwG/B7z7md4gyVVJJpNMzszM9G27JKmHPkGfWcqqZ513A9dV1ePP9AZVtb+qtlbV1vXr1/dokiSprz5TN9PAeUP7G4Gjc9SZTrIWOBs4BlwA7EzyR8A5wHeSPFFVH1hwyyVJvfQJ+kPA5iSbgK8Au4A3jtQ5AOwGPgnsBO6qqgJ+5mSFJNcAjxvykrS45g36qjqRZA9wB7AGuLGqDifZB0xW1QHgBuCDSaYYjOR3nc5GS5L66zOip6oOAgdHyq4e2n4CuHye17jmWbRPkrRAPhkrSY0z6CWpcQa9JDWu1xy95jb6tCz4xKyk5cURvSQ1zqCXpMYZ9JLUOINekhpn0EtS4wx6SWqcX688TfzapaTlwhG9JDXOoJekxhn0ktQ4g16SGmfQS1LjDHpJapxBL0mN83v0i8zv10tabI7oJalxBr0kNc6gl6TGOUe/TIzO3TtvL2lcHNFLUuMMeklqnEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGtcr6JNsT3IkyVSSvbMcPzPJLd3xu5NMdOWvTXJPkvu6nxeOt/mSpPnM+2RskjXA9cBrgWngUJIDVXX/ULUrgeNVdX6SXcC1wK8CjwK/VFVHk/wocAewYdydaJUrXUoahz4j+m3AVFU9WFVPATcDO0bq7ABu6rZvAy5Kkqq6t6qOduWHgecmOXMcDZck9dMn6DcADw/tT/P0Ufn/1amqE8BjwLqROr8C3FtVT46+QZKrkkwmmZyZmenbdklSD30WNcssZXUqdZK8jMF0zsWzvUFV7Qf2A2zdunX0tTXCKR1Jp6LPiH4aOG9ofyNwdK46SdYCZwPHuv2NwEeA36iqLy20wZKkU9NnRH8I2JxkE/AVYBfwxpE6B4DdwCeBncBdVVVJzgFuB95ZVZ8YX7M1G0f6kmYz74i+m3Pfw+AbMw8At1bV4ST7klzWVbsBWJdkCngrcPIrmHuA84E/SPKZ7t+Lxt4LSdKcev3hkao6CBwcKbt6aPsJ4PJZznsP8J4FtlGStAD+halVwL9eJa1uBv0q5Xy+tHq41o0kNc6gl6TGOXWj7+KUjtQeR/SS1DiDXpIa59SNenFKR1q5HNFLUuMMeklqnFM3WhCfupWWP4Nep4UXAGn5MOi1
aOa6oeuNXun0Mui1bM31W4G/LUinxqBXE/xtQZqbQa9VyQuAVhODXhoy27SQFwWtdAa99Cx5AdBKYdBLY9b3foEXBS0Wg15aQt5E1mIw6KUV5FR/W/Ceg8Cgl9RxyqldBr2kZ+VUflvwIrK0DHpJy87puIicLF+NDHpJq8rpvIgs199QDHpJOs2W+qa4f3hEkhpn0EtS4wx6SWqcQS9JjTPoJalxBr0kNa5X0CfZnuRIkqkke2c5fmaSW7rjdyeZGDr2zq78SJLXja/pkqQ+5g36JGuA64FLgC3AFUm2jFS7EjheVecD1wHXduduAXYBLwO2A3/avZ4kaZH0GdFvA6aq6sGqegq4GdgxUmcHcFO3fRtwUZJ05TdX1ZNV9R/AVPd6kqRFkqp65grJTmB7Vf1Wt//rwAVVtWeozue7OtPd/peAC4BrgH+vqg915TcAH62q20be4yrgqm73R4AjC+8a5wKPjuF1lrPV0Eewny1ZDX2EpennD1XV+tkO9FkCIbOUjV4d5qrT51yqaj+wv0dbeksyWVVbx/may81q6CPYz5ashj7C8utnn6mbaeC8of2NwNG56iRZC5wNHOt5riTpNOoT9IeAzUk2JTmDwc3VAyN1DgC7u+2dwF01mBM6AOzqvpWzCdgMfGo8TZck9THv1E1VnUiyB7gDWAPcWFWHk+wDJqvqAHAD8MEkUwxG8ru6cw8nuRW4HzgBvLmqvn2a+jJqrFNBy9Rq6CPYz5ashj7CMuvnvDdjJUkrm0/GSlLjDHpJalxzQT/fcg0rVZIbkzzSPbNwsuyFSe5M8sXu5wuWso3jkOS8JB9P8kCSw0ne0pU309ckz03yqSSf7fr47q58U7eEyBe7JUXOWOq2jkOSNUnuTfIP3X5z/UzyUJL7knwmyWRXtmw+s00Ffc/lGlaqv2SwjMSwvcDHqmoz8LFuf6U7Abytql4KvBJ4c/ffsKW+PglcWFU/DrwC2J7klQyWDrmu6+NxBkuLtOAtwAND+6328+er6hVD359fNp/ZpoKefss1rEhV9a8MvtE0bHjpiZuANyxqo06DqvpqVX262/5vBgGxgYb6WgOPd7vP6f4VcCGDJURghffxpCQbgUuBP+/2Q4P9nMOy+cy2FvQbgIeH9qe7slZ9f1V9FQYBCbxoidszVt0qqD8B3E1jfe2mMz4DPALcCXwJ+HpVneiqtPLZ/RPgHcB3uv11tNnPAv4pyT3dki6wjD6zfZZAWEl6Lbmg5S/J84G/AX63qr4xGAi2o3ue5BVJzgE+Arx0tmqL26rxSvJ64JGquifJa04Wz1J1Rfez86qqOprkRcCdSb6w1A0a1tqIfrUtufC1JD8A0P18ZInbMxZJnsMg5P+qqv62K26yr1X1deCfGdyPOKdbQgTa+Oy+CrgsyUMMplEvZDDCb62fVNXR7ucjDC7c21hGn9nWgr7Pcg0tGV56Yjfw90vYlrHo5nBvAB6oqvcPHWqmr0nWdyN5kjwP+AUG9yI+zmAJEVjhfQSoqndW1caqmmDw/+JdVfUmGutnkrOSfN/JbeBi4PMso89sc0/GJvlFBqOGk8s1vHeJmzQWSf4aeA2D5U+/BrwL+DvgVuDFwJeBy6tq9IbtipLk1cC/Affx//O6v89gnr6JviZ5OYObc2sYDLZurap9SV7CYOT7QuBe4Neq6smla+n4dFM3b6+q17fWz64/H+l21wIfrqr3JlnHMvnMNhf0kqTv1trUjSRphEEvSY0z6CWpcQa9JDXOoJekxhn0ktQ4g16SGve/o1AO1YXSGcAAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot the principal components that account for 85% of the variance:\n",
    "# one bar per retained component, bar height = its explained-variance ratio\n",
    "variance_ratios = pca.explained_variance_ratio_\n",
    "print(len(variance_ratios))\n",
    "plt.bar(range(len(variance_ratios)), variance_ratios)\n",
    "# Label the figure so it can be read on its own when the notebook is skimmed\n",
    "plt.xlabel(\"principal component index\")\n",
    "plt.ylabel(\"explained variance ratio\")\n",
    "# Trailing semicolon suppresses the text repr of the last call\n",
    "plt.title(\"Explained variance ratio per principal component\");"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. 对3中得到的数据（即降维后的数据）训练RBF核SVM，并对超参数（C和gamma）进行调优。将结果与使用原始数据的情况（SVM部分作业结果）进行比较。（30分）"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 使用3中得到的降维后的数据的结果"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "# Assemble the PCA-reduced features into a new DataFrame\n",
    "n_components = pca.n_components_  # number of principal components K that were kept\n",
    "feat_names_pca = [\"pca_\" + str(i) for i in range(n_components)]\n",
    "\n",
    "# Rebuild id and target from their raw values so they get a fresh default index\n",
    "# and line up row-by-row with the PCA array when concatenated\n",
    "id_ = pd.Series(data=X_train_part_id.values, name='id')\n",
    "y = pd.Series(data=y_train_part.values, name='target')\n",
    "\n",
    "pca_features = pd.DataFrame(data=X_train_pca, columns=feat_names_pca)\n",
    "train_pca = pd.concat([id_, pca_features, y], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(10000, 55)"
      ]
     },
     "execution_count": 20,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show (rows, columns) of the assembled table: id + PCA components + target\n",
    "train_pca.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>pca_0</th>\n",
       "      <th>pca_1</th>\n",
       "      <th>pca_2</th>\n",
       "      <th>pca_3</th>\n",
       "      <th>pca_4</th>\n",
       "      <th>pca_5</th>\n",
       "      <th>pca_6</th>\n",
       "      <th>pca_7</th>\n",
       "      <th>pca_8</th>\n",
       "      <th>...</th>\n",
       "      <th>pca_44</th>\n",
       "      <th>pca_45</th>\n",
       "      <th>pca_46</th>\n",
       "      <th>pca_47</th>\n",
       "      <th>pca_48</th>\n",
       "      <th>pca_49</th>\n",
       "      <th>pca_50</th>\n",
       "      <th>pca_51</th>\n",
       "      <th>pca_52</th>\n",
       "      <th>target</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>7898</td>\n",
       "      <td>0.625286</td>\n",
       "      <td>0.131340</td>\n",
       "      <td>-0.002001</td>\n",
       "      <td>0.173749</td>\n",
       "      <td>0.004266</td>\n",
       "      <td>0.035673</td>\n",
       "      <td>0.201813</td>\n",
       "      <td>-0.035828</td>\n",
       "      <td>-0.104780</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.081093</td>\n",
       "      <td>-0.036612</td>\n",
       "      <td>-0.032685</td>\n",
       "      <td>0.026958</td>\n",
       "      <td>0.023477</td>\n",
       "      <td>-0.031667</td>\n",
       "      <td>-0.021681</td>\n",
       "      <td>0.014094</td>\n",
       "      <td>0.040825</td>\n",
       "      <td>Class_2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>11288</td>\n",
       "      <td>0.476990</td>\n",
       "      <td>0.013569</td>\n",
       "      <td>-0.290358</td>\n",
       "      <td>0.009830</td>\n",
       "      <td>0.053649</td>\n",
       "      <td>-0.085235</td>\n",
       "      <td>0.015529</td>\n",
       "      <td>-0.417157</td>\n",
       "      <td>-0.044583</td>\n",
       "      <td>...</td>\n",
       "      <td>0.008607</td>\n",
       "      <td>0.012202</td>\n",
       "      <td>-0.012626</td>\n",
       "      <td>-0.053581</td>\n",
       "      <td>0.087355</td>\n",
       "      <td>0.009804</td>\n",
       "      <td>-0.011665</td>\n",
       "      <td>-0.019390</td>\n",
       "      <td>0.047540</td>\n",
       "      <td>Class_2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>10356</td>\n",
       "      <td>0.142455</td>\n",
       "      <td>-0.238999</td>\n",
       "      <td>0.004232</td>\n",
       "      <td>-0.113067</td>\n",
       "      <td>-0.035355</td>\n",
       "      <td>-0.191490</td>\n",
       "      <td>-0.234224</td>\n",
       "      <td>-0.405027</td>\n",
       "      <td>0.267705</td>\n",
       "      <td>...</td>\n",
       "      <td>0.035690</td>\n",
       "      <td>-0.022022</td>\n",
       "      <td>-0.055516</td>\n",
       "      <td>-0.078802</td>\n",
       "      <td>-0.089903</td>\n",
       "      <td>0.081683</td>\n",
       "      <td>-0.128651</td>\n",
       "      <td>-0.144019</td>\n",
       "      <td>0.000786</td>\n",
       "      <td>Class_2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>13439</td>\n",
       "      <td>0.600814</td>\n",
       "      <td>0.076997</td>\n",
       "      <td>-0.239394</td>\n",
       "      <td>0.054835</td>\n",
       "      <td>0.007561</td>\n",
       "      <td>-0.088206</td>\n",
       "      <td>0.020324</td>\n",
       "      <td>-0.282483</td>\n",
       "      <td>-0.057967</td>\n",
       "      <td>...</td>\n",
       "      <td>0.123055</td>\n",
       "      <td>-0.091573</td>\n",
       "      <td>-0.005661</td>\n",
       "      <td>-0.027059</td>\n",
       "      <td>0.032352</td>\n",
       "      <td>0.083729</td>\n",
       "      <td>0.069337</td>\n",
       "      <td>0.025704</td>\n",
       "      <td>-0.095524</td>\n",
       "      <td>Class_2</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>54130</td>\n",
       "      <td>-0.133692</td>\n",
       "      <td>-0.167084</td>\n",
       "      <td>-0.076314</td>\n",
       "      <td>-0.079330</td>\n",
       "      <td>0.105393</td>\n",
       "      <td>-0.226724</td>\n",
       "      <td>0.056367</td>\n",
       "      <td>0.039180</td>\n",
       "      <td>-0.224581</td>\n",
       "      <td>...</td>\n",
       "      <td>-0.030246</td>\n",
       "      <td>0.356076</td>\n",
       "      <td>0.198616</td>\n",
       "      <td>0.256928</td>\n",
       "      <td>-0.029483</td>\n",
       "      <td>-0.098445</td>\n",
       "      <td>-0.002309</td>\n",
       "      <td>0.016936</td>\n",
       "      <td>-0.085946</td>\n",
       "      <td>Class_8</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>5 rows × 55 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "      id     pca_0     pca_1     pca_2     pca_3     pca_4     pca_5  \\\n",
       "0   7898  0.625286  0.131340 -0.002001  0.173749  0.004266  0.035673   \n",
       "1  11288  0.476990  0.013569 -0.290358  0.009830  0.053649 -0.085235   \n",
       "2  10356  0.142455 -0.238999  0.004232 -0.113067 -0.035355 -0.191490   \n",
       "3  13439  0.600814  0.076997 -0.239394  0.054835  0.007561 -0.088206   \n",
       "4  54130 -0.133692 -0.167084 -0.076314 -0.079330  0.105393 -0.226724   \n",
       "\n",
       "      pca_6     pca_7     pca_8  ...    pca_44    pca_45    pca_46    pca_47  \\\n",
       "0  0.201813 -0.035828 -0.104780  ... -0.081093 -0.036612 -0.032685  0.026958   \n",
       "1  0.015529 -0.417157 -0.044583  ...  0.008607  0.012202 -0.012626 -0.053581   \n",
       "2 -0.234224 -0.405027  0.267705  ...  0.035690 -0.022022 -0.055516 -0.078802   \n",
       "3  0.020324 -0.282483 -0.057967  ...  0.123055 -0.091573 -0.005661 -0.027059   \n",
       "4  0.056367  0.039180 -0.224581  ... -0.030246  0.356076  0.198616  0.256928   \n",
       "\n",
       "     pca_48    pca_49    pca_50    pca_51    pca_52   target  \n",
       "0  0.023477 -0.031667 -0.021681  0.014094  0.040825  Class_2  \n",
       "1  0.087355  0.009804 -0.011665 -0.019390  0.047540  Class_2  \n",
       "2 -0.089903  0.081683 -0.128651 -0.144019  0.000786  Class_2  \n",
       "3  0.032352  0.083729  0.069337  0.025704 -0.095524  Class_2  \n",
       "4 -0.029483 -0.098445 -0.002309  0.016936 -0.085946  Class_8  \n",
       "\n",
       "[5 rows x 55 columns]"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Preview the first rows of the assembled PCA table\n",
    "train_pca.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Split the PCA table into labels and features for the SVM\n",
    "# (the \"trian\" spelling is kept because later cells reference these names)\n",
    "y_trian_pca = train_pca['target']\n",
    "x_trian_pca = train_pca.drop(columns=[\"id\", \"target\"])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.svm import SVC\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "\n",
    "# Log-spaced candidates: C in {0.1, 1, 10, 100, 1000}, gamma in {0.1, 1, 10}\n",
    "C_s = np.logspace(-1, 3, num=5)\n",
    "gamma_s = np.logspace(-1, 1, num=3)\n",
    "# The same grid is used by both grid searches below\n",
    "params_gird = {'gamma': gamma_s, 'C': C_s}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
       "             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
       "                           decision_function_shape='ovr', degree=3,\n",
       "                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,\n",
       "                           probability=False, random_state=None, shrinking=True,\n",
       "                           tol=0.001, verbose=False),\n",
       "             iid='warn', n_jobs=None,\n",
       "             param_grid={'C': array([1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),\n",
       "                         'gamma': array([ 0.1,  1. , 10. ])},\n",
       "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
       "             scoring='accuracy', verbose=0)"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# 5-fold cross-validated grid search over C and gamma for an RBF-kernel SVM on the PCA features\n",
    "SVC2 = SVC(kernel='rbf')\n",
    "grid_ = GridSearchCV(estimator=SVC2, param_grid=params_gird, scoring='accuracy', cv=5)\n",
    "grid_.fit(x_trian_pca, y_trian_pca)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'C': 10.0, 'gamma': 1.0}\n",
      "0.7793\n"
     ]
    }
   ],
   "source": [
    "# Best (C, gamma) combination and its mean cross-validated accuracy on the PCA features\n",
    "print(grid_.best_params_, grid_.best_score_, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 使用未降维的原始数据得到的结果如下"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "GridSearchCV(cv=5, error_score='raise-deprecating',\n",
       "             estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,\n",
       "                           decision_function_shape='ovr', degree=3,\n",
       "                           gamma='auto_deprecated', kernel='rbf', max_iter=-1,\n",
       "                           probability=False, random_state=None, shrinking=True,\n",
       "                           tol=0.001, verbose=False),\n",
       "             iid='warn', n_jobs=None,\n",
       "             param_grid={'C': array([1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),\n",
       "                         'gamma': array([ 0.1,  1. , 10. ])},\n",
       "             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,\n",
       "             scoring='accuracy', verbose=0)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Baseline for comparison: the same 5-fold grid search, run on the un-reduced features\n",
    "SVC2 = SVC(kernel='rbf')\n",
    "grid = GridSearchCV(estimator=SVC2, param_grid=params_gird, scoring='accuracy', cv=5)\n",
    "grid.fit(X_train_part, y_train_part)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'C': 10.0, 'gamma': 1.0}\n",
      "0.7838\n"
     ]
    }
   ],
   "source": [
    "# Best (C, gamma) combination and its mean cross-validated accuracy on the un-reduced features\n",
    "print(grid.best_params_, grid.best_score_, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从上面的结果对比可以看出，PCA降维后的交叉验证准确率（0.7793）与使用原始数据得到的准确率（0.7838）十分接近，差别极小，相差不到1个百分点（约0.45个百分点），并且两者的最优超参数相同（C=10，gamma=1）。这说明降维后可以用更少的特征维度（53维，原始特征为186维）和更低的计算复杂度，得到精确度相近的结果。"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
